### ---- SCRIPT 03: The purpose of this script is to validate the ideal point estimates of a subset     ---- ###
### ---- of political and media elites using estimates from a public survey. Where ideal points via     ---- ###   
### ---- social media networks are unavailable for certain parties and organisations, the mean estimate ---- ### 
### ---- of affiliated accounts is used instead.                                                        ---- ###

# Bias number printing strongly towards fixed notation (large scipen penalty
# against scientific notation).
options(scipen = 999)

#### ---- LIBRARIES ---- ####

library(dplyr) # data wrangling
library(ggplot2) # data visualisation
library(data.table) # working with big data
library(devtools) # installation of GGally from Github
# library(GGally) # for matrix plotting
library(ggrepel) # prevent overlapping labels in ggplot
library(foreign) # load SPSS files

#### ---- LOAD DATA ---- ####

# Ideal point data for Twitter users.
user_data <- fread(input = "user_ideal_point_data_updated.csv")

# Master list of guests, with their ideal points.
guest_data <- fread(input = "username_master_list_with_ideal_points.csv")

# Summary data from the YouGov survey.
yougov_data <- fread(input = "yougov_summary_data.csv")

#### ---- IDEAL POINT VALIDATION ---- #### 

# Generate a scatter plot matrix to evaluate the correlation between a user's ideal point, their organisation's ideal point, and the mean
# ideal point of organisation affiliates.
# Panel function passed to ggpairs() for the lower panels of the matrix.
#
# Draws the raw points and overlays two fitted smooths so that linear and
# non-linear trends can be compared: a loess curve (red) and a linear
# regression line (blue).
#
# data:    data frame holding the variables for the panel.
# mapping: aesthetic mapping for the panel.
# ...:     extra arguments, forwarded to both geom_smooth() calls.
#
# Returns the assembled ggplot object.
add_regression_lines <- function(data, mapping, ...) {
  loess_layer <- geom_smooth(method = loess, fill = "red", color = "red", ...)
  linear_layer <- geom_smooth(method = lm, fill = "blue", color = "blue", ...)

  ggplot(data = data, mapping = mapping) +
    geom_point() +
    loess_layer +
    linear_layer
}

# Scatter plot matrix of guest_data columns 29, 31 and 32, labelled "User",
# "Organisation" and "Org Affiliates"; the lower panels are drawn by
# add_regression_lines().
# ggpairs() is namespace-qualified because library(GGally) is commented out at
# the top of this script, so an unqualified call fails in a fresh session.
# NOTE(review): columns are selected by position, which breaks silently if the
# CSV layout changes - consider selecting by name once the names are confirmed.
scatter_matrix <- GGally::ggpairs(
  guest_data,
  columns = c(29, 31:32),
  lower = list(continuous = add_regression_lines),
  columnLabels = c("User", "Organisation", "Org Affiliates")
)

# Write the matrix to disk as a 7 x 4 inch, 300 dpi PNG on a white background.
ggsave(
  "ideal_point_scatter_matrix.png",
  scatter_matrix,
  units = "in", width = 7, height = 4, dpi = 300,
  bg = "white"
)

# Now, formally validate the ideal points of users and organisations against
# the mean ideological estimates provided by the YouGov survey.
# Firstly, attach each account's Twitter ideal point to the YouGov summary data
# by username. The right join keeps every YouGov row, so accounts with no match
# in the Twitter user data end up with an NA ideal point.
yougov_data <- user_data %>%
  select(username, user_ideal_point) %>%
  right_join(yougov_data, by = "username")

# Some organisations do not have an ideal point, so theirs is approximated by
# the mean ideal point of elite accounts whose description mentions the handle.
elite_users <- user_data %>% filter(elite_account == "Elite Account")

# "@handle" search pattern, set only for rows that still lack an ideal point.
yougov_data$username_match <- ifelse(
  is.na(yougov_data$user_ideal_point),
  paste0("@", yougov_data$username),
  NA
)

# Only rows with a search pattern need an affiliate lookup; rows that already
# have an ideal point are skipped instead of being searched with an NA pattern.
rows_needing_proxy <- which(!is.na(yougov_data$username_match))

# Collect the matching elite accounts per organisation and bind them once,
# rather than growing a data frame with rbind() on every iteration.
# NOTE(review): the handle is matched as a case-insensitive substring, so a
# short handle also matches longer handles that start with it - confirm this
# is intended before tightening the pattern, as it would change the estimates.
affiliated_accounts_df <- bind_rows(lapply(rows_needing_proxy, function(row) {
  handle_pattern <- yougov_data$username_match[row]
  affiliated_accounts <- elite_users %>%
    filter(grepl(handle_pattern, description, ignore.case = TRUE))
  # Tag every affiliate with the surveyed organisation it was matched to.
  affiliated_accounts$name <- rep(yougov_data$name[row],
                                  nrow(affiliated_accounts))
  affiliated_accounts
}))

# Per-organisation summary of the affiliates' ideal points.
affiliated_accounts_summary <- affiliated_accounts_df %>%
  group_by(name) %>%
  summarise(
    count = n(),
    affiliate_mean_ideal_point = mean(user_ideal_point, na.rm = TRUE),
    affiliate_median_ideal_point = median(user_ideal_point, na.rm = TRUE),
    affiliate_ideal_point_sd = sd(user_ideal_point, na.rm = TRUE)
  )

# Merge the organisation affiliation summary data to the original YouGov data
yougov_data <- left_join(yougov_data, affiliated_accounts_summary, by = "name")

# Where organisations do not have an ideal point, take the mean affiliate ideal
# point instead as proxy.
yougov_data$user_ideal_point <- ifelse(
  is.na(yougov_data$user_ideal_point),
  yougov_data$affiliate_mean_ideal_point,
  yougov_data$user_ideal_point
)

# Test the strength of relationship between the Twitter ideal points and the
# YouGov survey estimates.
# use = "complete.obs" drops accounts that still have no ideal point, in line
# with lm() under R's default na.action; without it a single NA makes cor()
# return NA.
cor(yougov_data$user_ideal_point, yougov_data$mean,
    use = "complete.obs", method = "pearson") # pearson's r = 0.78

# Fit the validation regression once and reuse it for the summary and the
# diagnostic plots instead of refitting it for each call.
validation_model <- lm(user_ideal_point ~ mean, data = yougov_data)
summary(validation_model) # r-sq = 0.61

# Base R diagnostic plots for the fitted model.
plot(validation_model)

# R-squared shown on the plot, computed from the data rather than hard-coded
# so the annotation cannot drift from the fitted model. For a regression with
# a single predictor R-squared is the squared correlation, so it is the same
# whichever variable is treated as the response.
validation_r_squared <- round(
  summary(lm(user_ideal_point ~ mean, data = yougov_data))$r.squared,
  2
)

# Plot the Twitter ideal point estimates against the YouGov mean ideology
# estimate for each account as a scatter plot, labelled with the name, and
# points shaped by account type.
validation_scatterplot <- ggplot(
  yougov_data,
  aes(x = user_ideal_point, y = mean, label = name, shape = type)
) +
  geom_point() +
  # The label aesthetic is inherited from the plot-level mapping.
  geom_text_repel(size = 2, max.overlaps = 15) +
  # group = 1 fits one line across all account types instead of one per shape.
  geom_smooth(method = "lm", aes(group = 1)) +
  labs(
    x = "Twitter Ideal Point",
    y = "General Public Estimate",
    shape = "Type"
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  annotate(
    "text",
    x = 0.17, y = 8,
    label = paste("italic(R^2) ==", validation_r_squared),
    parse = TRUE
  )

# Write the plot to disk as a 7 x 4 inch, 300 dpi PNG on a white background.
ggsave(
  "yougov_validation_scatterplot.png",
  validation_scatterplot,
  units = "in", width = 7, height = 4, dpi = 300,
  bg = "white"
)

#### ---- END ---- ####
